# coding:utf-8

import urllib2
import re
import sys
class Spider:
    """Interactive page-by-page scraper for ``<url_base><n>.html`` pages.

    For every page fetched, the raw response is dumped to ``page.html``
    (overwritten each time) and the text found inside each
    ``<dd class="content">`` element is appended to ``content.txt``.

    Attributes:
        page: number of the next page to download (starts at 1).
        is_continue: loop flag read by ``controlWork``.
        url_base: URL prefix; page number and ``.html`` are appended to it.
        headers: HTTP headers sent with every request.
        timeout: seconds to wait on the network before ``urlopen`` raises.
    """

    # Compiled once for the whole class rather than on every page.
    # re.S lets '.' match newlines, so multi-line content blocks are captured.
    _CONTENT_RE = re.compile(r'<dd\sclass="content">(.*?)</dd>', re.S)

    # Network timeout in seconds. Without one, a stalled server would block
    # the program forever. Override per instance if needed.
    timeout = 30

    def __init__(self):
        """Initialise the crawl state and the request headers."""
        self.page = 1
        self.is_continue = True
        self.url_base = 'http://www.neihan.net/text_'
        self.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'}

    def getPage(self,url):
        """Download ``url``, save the raw body and extract its content.

        The body is written unchanged to ``page.html`` and then handed to
        ``getPageUsefulContext``.

        Raises:
            urllib2.URLError (or socket.timeout): if the download fails or
                exceeds ``self.timeout`` seconds.
        """
        print('正在下载页面...')
        request = urllib2.Request(url, headers=self.headers)
        response = urllib2.urlopen(request, timeout=self.timeout)
        try:
            page = response.read()
        finally:
            # Python 2 response objects are not context managers; close
            # explicitly so the socket is released even if read() fails.
            response.close()
        # Binary mode: the body is raw bytes and must not go through
        # platform newline translation.
        with open('page.html', 'wb') as f:
            f.write(page)
        self.getPageUsefulContext(page)

    def getPageUsefulContext(self,page):
        """Extract every content block from ``page`` and save each one.

        ``<p>``, ``</p>`` and ``<br/>`` tags are stripped from each block
        before it is passed to ``savePageContext``.
        """
        for item in self._CONTENT_RE.findall(page):
            data = item.replace('<p>', '').replace('</p>', '').replace('<br/>', '')
            self.savePageContext(data)

    def savePageContext(self,data):
        """Append ``data``, prefixed with a tab, to ``content.txt``."""
        print('正在保存数据...')
        with open('content.txt', 'a') as f:
            f.write('\t' + data)

    def controlWork(self):
        """Run the interactive loop: prompt, fetch one page, advance.

        Typing ``quit`` at the prompt, or closing stdin, terminates the
        process via ``sys.exit(0)``.
        """
        while self.is_continue:
            try:
                # Named 'answer' so the builtin input() is not shadowed.
                answer = raw_input('按任意键继续抓取下一页数据，输入quit退出程序\n')
            except EOFError:
                # stdin was closed (Ctrl-D / exhausted pipe): exit cleanly
                # instead of dying with a traceback.
                answer = 'quit'
            if answer == 'quit':
                sys.exit(0)

            self.getPage(self.url_base + str(self.page) + '.html')
            self.page += 1

# Script entry point: start the interactive crawl loop when run directly.
if __name__ == '__main__':
    Spider().controlWork()